/*    Copyright 2012 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import mosdi.util.Alphabet;
import mosdi.util.AminoAcidMasses;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.SequenceUtils;

public class CountFragmentMassesSubcommand extends Subcommand {

	@Override
	public String usage() {
		return
		super.usage()+" [options] <proteins.fasta>\n" +
		"\n" +
		"Output format:\n" +
		"<bin-start> <bin-end> <fragment-count> <protein-count>\n" +
		"where <fragment-count> is the total number of observed fragments,\n" +
		"and <protein-count> is the number of proteins that contain at least\n" +
		"on fragment in that mass range.\n" +
		"\n" +
		"Options:\n" +
		"  -d <accuracy>: discretize amino acid masses with given accuracy before\n" +
		"                 summation (default: do not discretize)\n" +
		"  -b <binwidth>: width of histogram bins (default: 1Da)\n" + 
		"  -m <min-mass>: left border of first bin (default: 56.5Da)";
	}

	@Override
	public String description() {
		return "Computes a histogram over observed protein fragment masses for Trypsin cleavage.";
	}

	@Override
	public String name() {
		return "mass-histogram";
	}
	
	@Override
	public int run(String[] args) {
		parseOptions(args, 1, "d:b:m:");

		// Option dependencies
		// -- NONE --

		// Mandatory arguments
		String sequenceFilename = getStringArgument(0);

		// Options
		double massAccuracy = getPositiveDoubleOption("d", -1.0);
		double binWidth = getPositiveDoubleOption("b", 1.0);
		double minMass = getDoubleOption("m", 56.5);

		Alphabet alphabet = Alphabet.getAminoAcidAlphabet();

		List<NamedSequence> namedSequences = null; 
		try {
			namedSequences = SequenceUtils.readFastaFile(sequenceFilename, alphabet);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}

		// maps discretized masses to counts
		// histogram over fragment occurrences
		Map<Integer, Integer> fragmentHistogram = new HashMap<Integer, Integer>();
		// histogram over number of proteins contain at least one fragment of a given mass
		Map<Integer, Integer> massOccHistogram = new HashMap<Integer, Integer>();
		for (NamedSequence ns : namedSequences) {
			// process protein
			double mass = 0.0;
			int intMass = 0;
			int[] protein = ns.getSequence();
			StringBuffer sb = new StringBuffer();
			// set of masses (in terms of there bin index) that occur in the current protein
			Set<Integer> occurringMasses = new HashSet<Integer>();
			for (int i = 0; i<protein.length; ++i) {
				if (massAccuracy > 0.0) {
					intMass += AminoAcidMasses.getMainMass(protein[i],massAccuracy);
				} else{
					mass += AminoAcidMasses.getMainMass(protein[i]);					
				}
				if (Log.levelAtLeast(Log.Level.DEBUG)) sb.append(alphabet.get(protein[i]));
				boolean fragmentEnds;
				if (i == protein.length-1) {
					fragmentEnds = true;
				} else {
					char chr = alphabet.get(protein[i]);
					char nextChr = alphabet.get(protein[i+1]);
					if (((chr == 'K') || (chr == 'R')) && (nextChr != 'P')) {
						fragmentEnds = true; 
					} else {
						fragmentEnds = false;
					}
				}
				if (fragmentEnds) {
					if (Log.levelAtLeast(Log.Level.DEBUG)) sb.append('|');
					if (massAccuracy > 0.0) {
						mass = intMass * massAccuracy;
					}
					if (mass < minMass) {
						Log.errorln(String.format("Error: encountered mass smaller then min-mass (parameter -m): %f Da", mass));
						System.exit(1);
					}
					int bin = (int)((mass - minMass) / binWidth);
					occurringMasses.add(bin);
					if (fragmentHistogram.containsKey(bin)) {
						fragmentHistogram.put(bin, fragmentHistogram.get(bin) + 1);
					} else {
						fragmentHistogram.put(bin, 1);
					}
					mass = 0.0;
					intMass = 0;
				}
			}
			for (int bin : occurringMasses) {
				if (massOccHistogram.containsKey(bin)) {
					massOccHistogram.put(bin, massOccHistogram.get(bin) + 1);
				} else {
					massOccHistogram.put(bin, 1);
				}
			}
			Log.println(Log.Level.DEBUG, sb.toString());
		}
		List<Integer> nonemptyBins = new ArrayList<Integer>(fragmentHistogram.keySet());
		Collections.sort(nonemptyBins);
		for (int bin : nonemptyBins) {
			Log.printf(Log.Level.STANDARD, "%f %f %d %d\n", minMass+(bin*binWidth), minMass+((bin+1)*binWidth), fragmentHistogram.get(bin), massOccHistogram.get(bin));
		}
		return 0;
	}
}
